###############################################################################
# Main Analysis I
###############################################################################
###############################################################################
# All Models reported in Main Text, including their Tables
###############################################################################
###############################################################################
# Content
###############################################################################
# 1) Dependencies
# 2) Load Data
# 3) Modeling
# 4) Save Model Output in Tables
###############################################################################
# 1) Dependencies
###############################################################################
library(dplyr)
library(lubridate)
library(texreg)
library(MASS)
library(lme4)
###############################################################################
# 2) Load Data
###############################################################################
# Set Path
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
rm(list=ls())

set.seed(0213)
# Variant with articles that mention a person ####


### Overall (M vs F):
d_overall_i <- read.csv("../data/diff_in_diff_m_vs_f_no_split_no_presidents.csv")%>% 
  mutate(month = month(date),
         day = day(date)) %>% 
  mutate(date = as.Date(paste("2021",month,day, sep = '-')))
head(d_overall_i)

### Overall (M vs F vs NO Mention):
d_overall_ii <- read.csv("../data/diff_in_diff_m_vs_f_vs_NA_no_split_absolute_no_presidents.csv")%>% 
  mutate(month = month(date),
         day = day(date)) %>% 
  mutate(date = as.Date(paste("2021",month,day, sep = '-')))
head(d_overall_ii)

### Issues (M vs F):
d_issues_i <- read.csv("../data/diff_in_diff_m_vs_f_topic_split_no_presidents.csv")%>% 
  mutate(month = month(date),
         day = day(date)) %>% 
  mutate(date = as.Date(paste("2021",month,day, sep = '-')))
head(d_issues_i)

### Issues (M vs F vs NO Mention):
d_issues_ii <- read.csv("../data/diff_in_diff_m_vs_f_vs_NA_topic_split_no_presidents.csv")%>% 
  mutate(month = month(date),
         day = day(date)) %>% 
  mutate(date = as.Date(paste("2021",month,day, sep = '-')))
head(d_issues_ii)
###############################################################################
# 3) Modeling
###############################################################################
# --------------------------------------------------------------------------- #
# Data Transformation Overall
# --------------------------------------------------------------------------- #
d_overall_i <- d_overall_i %>% mutate(date = as.Date(date)) %>% 
                               tidyr::complete(date = seq.Date(min(date), max(date), by = "day"), 
                                               year, gender, 
                                               fill = list(n = 0,
                                                           sum_art = 0,
                                                           freq = 0,
                                                           perc = 0))

d_overall_i <- d_overall_i %>% mutate(month = month(date),
                                      day = day(date)) %>% 
  mutate(date_2 = as.Date(paste(year,month,day, sep = '-'))) %>%
  mutate(wday = weekdays(date_2))

d_overall_i <- d_overall_i %>%
              	filter(gender == "f") %>%
              	mutate(
              		treated = case_when(year == 2015 ~ 0, year == 2019 ~ 1),
              		post_treatment = case_when(date < "2021-06-14" ~ 0, 
              		                           date >= "2021-06-14" ~ 1),
              		                           #date >= "2021-06-14" & year == 2019 ~ 1, 
              		                           #date >= "2021-06-14" & year == 2015 ~ 0),
              		month = month(date),
              		week = week(date)
              	)
head(d_overall_i)
# --------------------------------------------------------------------------- #
# Data Transformation Overall (M vs F vs NO Mention)
# --------------------------------------------------------------------------- #
d_overall_ii <- d_overall_ii %>% mutate(date = as.Date(date)) %>% 
                                 tidyr::complete(date = seq.Date(min(date), max(date), by = "day"), 
                                                 year, 
                                                 gender, 
                                                 fill = list(n_2 = 0,
                                                             sum_day = 0,
                                                             freq = 0,
                                                             perc = 0))

d_overall_ii <- d_overall_ii %>% mutate(month = month(date),
                                        day = day(date)) %>% 
  mutate(date_2 = as.Date(paste(year,month,day, sep = '-'))) %>%
  mutate(wday = weekdays(date_2))

d_overall_ii <- d_overall_ii %>%
                  filter(gender == "f") %>%
                  mutate(
                    treated = case_when(year == 2015 ~ 0, year == 2019 ~ 1),
                    post_treatment = case_when(date < "2021-06-14" ~ 0, 
                                               date >= "2021-06-14" ~ 1),
                    #date >= "2021-06-14" & year == 2019 ~ 1, 
                    #date >= "2021-06-14" & year == 2015 ~ 0),
                    month = month(date),
                    week = week(date)
                  )
head(d_overall_ii)

# --------------------------------------------------------------------------- #
# Data Transformation Issues
# --------------------------------------------------------------------------- #
d_issues_i <- d_issues_i %>% mutate(date = as.Date(date)) %>% 
                             tidyr::complete(date = seq.Date(min(date), max(date), by = "day"), 
                                             year, 
                                             gender, 
                                             selectsclass,
                                             fill = list(n = 0,
                                                         sum_art_class = 0,
                                                         freq = 0,
                                                         perc = 0))

d_issues_i <- d_issues_i %>% mutate(month = month(date),
                                    day = day(date)) %>% 
  mutate(date_2 = as.Date(paste(year,month,day, sep = '-'))) %>%
  mutate(wday = weekdays(date_2))

d_issues_i <- d_issues_i %>%
                filter(gender == "f") %>%
                mutate(
                  treated = case_when(year == 2015 ~ 0, year == 2019 ~ 1),
                  post_treatment = case_when(date < "2021-06-14" ~ 0, 
                                             date >= "2021-06-14" ~ 1),
                  #date >= "2021-06-14" & year == 2019 ~ 1, 
                  #date >= "2021-06-14" & year == 2015 ~ 0),
                  month = month(date),
                  week = week(date)
                )
head(d_issues_i)

unique(d_issues_i$selectsclass)
#  [1] "NotPolitical"                       "PoliticalSystem"
#  [3] "PublicServices_Infrastructure"      "SocialSecurity_WelfareState"
#  [5] "Economy"                            "Regions_NationalCohesion"
#  [7] "Agriculture"                        "Education_Culture"
#  [9] "Environment_Energy"                 "EU_Europa"
# [11] "Finances_Taxes"                     "Immigration_Asylum"
# [13] "Law_Order"                          "Other_unclassified_Political_Texts"
# [15] "GenderIssues_Discrimination"        "PublicHealth"
# [17] "LabourMarket"                       "Other_Problems"
# [19] "Not Classified"                     "InternationalRelations"
# --------------------------------------------------------------------------- #
# Data Transformation Issues (M vs F vs NO Mention)
# --------------------------------------------------------------------------- #
d_issues_ii <- d_issues_ii %>% mutate(date = as.Date(date)) %>% 
                               tidyr::complete(date = seq.Date(min(date), max(date), by = "day"), 
                                               year, 
                                               gender, 
                                               selectsclass, 
                                               fill = list(n_2 = 0,
                                                           sum_day = 0,
                                                           freq = 0,
                                                           perc = 0))

d_issues_ii <- d_issues_ii %>% mutate(month = month(date),
                                      day = day(date)) %>% 
  mutate(date_2 = as.Date(paste(year,month,day, sep = '-'))) %>%
  mutate(wday = weekdays(date_2))

d_issues_ii <- d_issues_ii %>%
              filter(gender == "f") %>%
              mutate(
                treated = case_when(year == 2015 ~ 0, year == 2019 ~ 1),
                post_treatment = case_when(date < "2021-06-14" ~ 0, 
                                           date >= "2021-06-14" ~ 1),
                #date >= "2021-06-14" & year == 2019 ~ 1, 
                #date >= "2021-06-14" & year == 2015 ~ 0),
                month = month(date),
                week = week(date)
              )
head(d_issues_ii)

unique(d_issues_ii$selectsclass)
#  [1] "NotPolitical"                       "PoliticalSystem"
#  [3] "PublicServices_Infrastructure"      "SocialSecurity_WelfareState"
#  [5] "Economy"                            "Regions_NationalCohesion"
#  [7] "Agriculture"                        "Education_Culture"
#  [9] "Environment_Energy"                 "EU_Europa"
# [11] "Finances_Taxes"                     "Immigration_Asylum"
# [13] "Law_Order"                          "Other_unclassified_Political_Texts"
# [15] "GenderIssues_Discrimination"        "PublicHealth"
# [17] "LabourMarket"                       "Other_Problems"
# [19] "Not Classified"                     "InternationalRelations"

# --------------------------------------------------------------------------- #
# Overall Linear Regression Models
# --------------------------------------------------------------------------- #
out_overall_i <- lm(perc ~ treated * post_treatment + as.factor(month), data = d_overall_i)
summary(out_overall_i)

out_overall_i <- lm(perc ~ treated * post_treatment + as.factor(month), data = subset(d_overall_i, month %in% c(1,2,3,4,5,6,7,8,9,10,11)))
summary(out_overall_i)

out_overall_ib <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), data = d_overall_i)
summary(out_overall_ib)

out_overall_ib <- lm(perc ~ treated * post_treatment + as.factor(month)  + as.factor(wday), data = subset(d_overall_i, month %in% c(1,2,3,4,5,6,7,8,9,10,11)))
summary(out_overall_ib)
# --------------------------------------------------------------------------- #
# Overall Negative Binomial Regression Models
# --------------------------------------------------------------------------- #
out_overall_i_nb <- glm.nb(n ~ treated * post_treatment + as.factor(month), data = d_overall_i)
summary(out_overall_i_nb)

out_overall_i_nb <- glm.nb(n ~ treated * post_treatment + as.factor(month), data = subset(d_overall_i, month %in% c(1,2,3,4,5,6,7,8,9,10,11)))
summary(out_overall_i_nb)

out_overall_ib_nb <- glm.nb(n ~ treated * post_treatment + as.factor(month) + as.factor(wday), data = d_overall_i)
summary(out_overall_ib_nb)

out_overall_ib_nb <- glm.nb(n ~ treated * post_treatment + as.factor(month) + as.factor(wday), data = subset(d_overall_i, month %in% c(1,2,3,4,5,6,7,8,9,10,11)))
summary(out_overall_ib_nb)
# --------------------------------------------------------------------------- #
# Issues Linear Regression Models
# --------------------------------------------------------------------------- #
out_gender_i <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_i, selectsclass == "Gender"))
summary(out_gender_i)

out_eu_i <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_i, selectsclass == "Europe"))
summary(out_eu_i)

out_immigration_i <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_i, selectsclass == "Immigration"))
summary(out_immigration_i)

out_environment_i <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_i, selectsclass == "Environment"))
summary(out_environment_i)

out_economy_i <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_i, selectsclass == "Economy"))
summary(out_economy_i)

out_health_i <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_i, selectsclass == "Public Health"))
summary(out_health_i)
# --------------------------------------------------------------------------- #
# Issues Negative Binomial Regression Models
# --------------------------------------------------------------------------- #
out_gender_i_nb <- glm.nb(n ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_i, selectsclass == "Gender"))
summary(out_gender_i_nb)

out_eu_i_nb <- glm.nb(n ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_i, selectsclass == "Europe"))
summary(out_eu_i_nb)

out_immigration_i_nb <- glm.nb(n ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_i, selectsclass == "Immigration"))
summary(out_immigration_i_nb)

out_environment_i_nb <- glm.nb(n ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_i, selectsclass == "Environment"))
summary(out_environment_i_nb)

out_economy_i_nb <- glm.nb(n ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_i, selectsclass == "Economy"))
summary(out_economy_i_nb)

out_health_i_nb <- glm.nb(n ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_i, selectsclass == "Public Health"))
summary(out_health_i_nb)
# --------------------------------------------------------------------------- #
# Overall Linear Regression Models  (M vs F vs NO Mention)
# --------------------------------------------------------------------------- #
# Variant with all articles (including no mentions) ####
out_overall_ii <- lm(perc ~ treated * post_treatment + as.factor(month), data = d_overall_ii)
summary(out_overall_ii)

out_overall_ii <- lm(perc ~ treated * post_treatment + as.factor(month), data = subset(d_overall_ii, month %in% c(1,2,3,4,5,6,7,8,9,10,11)))
summary(out_overall_ii)

out_overall_iib <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), data = d_overall_ii)
summary(out_overall_iib)

out_overall_iib <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), data = subset(d_overall_ii, month %in% c(1,2,3,4,5,6,7,8,9,10,11)))
summary(out_overall_iib)
# --------------------------------------------------------------------------- #
# Overall Negative Binomial  Regression Models  (M vs F vs NO Mention)
# --------------------------------------------------------------------------- #
# Variant with all articles (including no mentions) ####
out_overall_ii_nb <- glm.nb(n_2 ~ treated * post_treatment + as.factor(month), data = d_overall_ii)
summary(out_overall_ii_nb)

out_overall_ii_nb <- glm.nb(n_2 ~ treated * post_treatment + as.factor(month), data = subset(d_overall_ii, month %in% c(1,2,3,4,5,6,7,8,9,10,11)))
summary(out_overall_ii_nb)

out_overall_iib_nb <- glm.nb(n_2 ~ treated * post_treatment + as.factor(month) + as.factor(wday), data = d_overall_ii)
summary(out_overall_iib_nb)

out_overall_iib_nb <- glm.nb(n_2 ~ treated * post_treatment + as.factor(month) + as.factor(wday), data = subset(d_overall_ii, month %in% c(1,2,3,4,5,6,7,8,9,10,11)))
summary(out_overall_iib_nb)
# --------------------------------------------------------------------------- #
# Issues Linear Regression Models  (M vs F vs NO Mention)
# --------------------------------------------------------------------------- #
out_gender_ii <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_ii, selectsclass == "Gender"))
summary(out_gender_ii)

out_eu_ii <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_ii, selectsclass == "Europe"))
summary(out_eu_ii)

out_immigration_ii <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_ii, selectsclass == "Immigration"))
summary(out_immigration_ii)

out_environment_ii <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_ii, selectsclass == "Environment"))
summary(out_environment_ii)

out_economy_ii <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_ii, selectsclass == "Economy"))
summary(out_economy_ii)

out_health_ii <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_ii, selectsclass == "Public Health"))
summary(out_health_ii)
# --------------------------------------------------------------------------- #
# Issues Negative Binomial Regression Models  (M vs F vs NO Mention)
# --------------------------------------------------------------------------- #
out_gender_ii_nb <- glm.nb(n_2 ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_ii, selectsclass == "Gender"))
summary(out_gender_ii_nb)

out_eu_ii_nb <- glm.nb(n_2 ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_ii, selectsclass == "Europe"))
summary(out_eu_ii_nb)

out_immigration_ii_nb <- glm.nb(n_2 ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_ii, selectsclass == "Immigration"))
summary(out_immigration_ii_nb)

out_environment_ii_nb <- glm.nb(n_2 ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_ii, selectsclass == "Environment"))
summary(out_environment_ii_nb)

out_economy_ii_nb <- glm.nb(n_2 ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_ii, selectsclass == "Economy"))
summary(out_economy_ii_nb)

out_health_ii_nb <- glm.nb(n_2 ~ treated * post_treatment + as.factor(month) + as.factor(wday), subset(d_issues_ii, selectsclass == "Public Health"))
summary(out_health_ii_nb)
###############################################################################
# 4) Save Model Output in Tables
###############################################################################
## Tabelle
htmlreg(list(out_overall_i, out_overall_ib, out_gender_i, out_environment_i, out_eu_i, out_immigration_i, out_economy_i, out_health_i),
        file = "../tables_main/diff-in-diff-mf.html",
        custom.model.names = c("Overall","Overall (Weekdays FEs)", "Gender", "Environment", "Europe", "Immigration", "Economy", "Health")#,
        # omit.coef = "(partei_)|(kanton_)"
)


texreg::texreg(list(out_overall_i, out_overall_ib, out_gender_i, out_environment_i, out_eu_i, out_immigration_i),
               file="../tables_main/diff-in-diff-mf.tex",
               custom.model.names = c("Overall", "Overall", "Gender", "Environment", "Europe", "Immigration"),
               custom.gof.rows = list("Month FEs" = c("\\checkmark", "\\checkmark", "\\checkmark","\\checkmark", "\\checkmark","\\checkmark"),
                                      "Weekdays FEs" = c("", "\\checkmark", "\\checkmark", "\\checkmark", "\\checkmark", "\\checkmark")),
               custom.coef.map=list("treated"="Strike Year (2019)",
                                    "post_treatment"="After women's strike",
                                    "treated:post_treatment" = "Strike Year (2019) * After Women Strike",
                                    "(Intercept)"="(Intercept)"),
               label="table:diffindiff",
               float.pos="h",
               caption="Statistical OLS models of the share of articles mentioning at least one female candidate.")

## Tabelle
htmlreg(list(out_overall_ii, out_overall_iib, out_gender_ii, out_environment_ii, out_eu_ii, out_immigration_ii, out_economy_ii, out_health_ii),
        file = "../tables_main/diff-in-diff-mfNA.html",
        custom.model.names = c("Overall", "Overall (Weekdays FEs)", "Gender", "Environment", "Europe", "Immigration", "Economy", "Health")#,
        # omit.coef = "(partei_)|(kanton_)"
)


texreg::texreg(list(out_overall_ii, out_overall_iib, out_gender_ii, out_environment_ii, out_eu_ii, out_immigration_ii),
               file="../tables_main/diff-in-diff-mfNA.tex",
               custom.model.names = c("Overall", "Overall", "Gender", "Environment", "Europe", "Immigration"),
               custom.gof.rows = list("Month FEs" = c("\\checkmark", "\\checkmark", "\\checkmark","\\checkmark", "\\checkmark","\\checkmark"),
                                      "Weekdasy FEs" = c("", "\\checkmark", "\\checkmark", "\\checkmark", "\\checkmark", "\\checkmark")),
               custom.coef.map=list("treated"="Strike Year (2019)",
                                    "post_treatment"="After women's strike",
                                    "treated:post_treatment" = "Strike Year (2019) * After Women Strike",
                                    "(Intercept)"="(Intercept)"),
               label="table:appdiffindiff",
               float.pos="h",
               caption="Statistical OLS models of the share of articles mentioning at least one female candidate. Robustness check for Table \\ref{table:diffindiff} with share of all articles, rather than only those mentioning candidates.")

# ----------------------------------------------------------------------------#
# Linear Negative Binomial Models
# ----------------------------------------------------------------------------#
## Tabelle
htmlreg(list(out_overall_i_nb, out_overall_ib_nb, out_gender_i_nb, out_environment_i_nb, out_eu_i_nb, out_immigration_i_nb, out_economy_i_nb, out_health_i_nb),
        file = "../tables_main/diff-in-diff-mf-negbin.html",
        custom.model.names = c("Overall","Overall (Weekdays FEs)", "Gender", "Environment", "Europe", "Immigration", "Economy", "Health")#,
        # omit.coef = "(partei_)|(kanton_)"
)


texreg::texreg(list(out_overall_i_nb, out_overall_ib_nb, out_gender_i_nb, out_environment_i_nb, out_eu_i_nb, out_immigration_i_nb),
               file="../tables_main/diff-in-diff-mf-negbin.tex",
               custom.model.names = c("Overall", "Overall", "Gender", "Environment", "Europe", "Immigration"),
               custom.gof.rows = list("Month FEs" = c("\\checkmark", "\\checkmark", "\\checkmark","\\checkmark", "\\checkmark","\\checkmark"),
                                      "Weekdays FEs" = c("", "\\checkmark", "\\checkmark", "\\checkmark", "\\checkmark", "\\checkmark")),
               custom.coef.map=list("treated"="Strike Year (2019)",
                                    "post_treatment"="After women's strike",
                                    "treated:post_treatment" = "Strike Year (2019) * After Women Strike",
                                    "(Intercept)"="(Intercept)"),
               label="table:diffindiff",
               float.pos="h",
               caption="Statistical negative binomial models of the number of articles mentioning at least one female candidate.")

## Tabelle
htmlreg(list(out_overall_ii_nb, out_overall_iib_nb, out_gender_ii_nb, out_environment_ii_nb, out_eu_ii_nb, out_immigration_ii_nb, out_economy_ii_nb, out_health_ii_nb),
        file = "../tables_main/diff-in-diff-mfNA-negbin.html",
        custom.model.names = c("Overall", "Overall (Weekdays FEs)", "Gender", "Environment", "Europe", "Immigration", "Economy", "Health")#,
        # omit.coef = "(partei_)|(kanton_)"
)


texreg::texreg(list(out_overall_ii_nb, out_overall_iib_nb, out_gender_ii_nb, out_environment_ii_nb, out_eu_ii_nb, out_immigration_ii_nb),
               file="../tables_main/diff-in-diff-mfNA-negbin.tex",
               custom.model.names = c("Overall", "Overall", "Gender", "Environment", "Europe", "Immigration"),
               custom.gof.rows = list("Month FEs" = c("\\checkmark", "\\checkmark", "\\checkmark","\\checkmark", "\\checkmark","\\checkmark"),
                                      "Weekdasy FEs" = c("", "\\checkmark", "\\checkmark", "\\checkmark", "\\checkmark", "\\checkmark")),
               custom.coef.map=list("treated"="Strike Year (2019)",
                                    "post_treatment"="After women's strike",
                                    "treated:post_treatment" = "Strike Year (2019) * After Women Strike",
                                    "(Intercept)"="(Intercept)"),
               label="table:appdiffindiff",
               float.pos="h",
               caption="Statistical negative binomial models of the number of articles mentioning at least one female candidate. Robustness check for Table \\ref{table:diffindiff} with share of all articles, rather than only those mentioning candidates.")

###############################################################################
# Test if OLS assumptions are held
###############################################################################
out_overall_ib <- lm(perc ~ treated * post_treatment + as.factor(month) + as.factor(wday), data = d_overall_i)
summary(out_overall_ib)
# --------------------------------------------------------------------------- #
# Linearity
# --------------------------------------------------------------------------- #
plot(residuals(out_overall_ib) ~ fitted(out_overall_ib), ylab="Residuals", xlab="Fitted values")
abline(h = 0, col = "red")

# If there's a discernible pattern (e.g., a curve), this suggests non-linearity.
# No curve visible 
# Assumption Holds

# --------------------------------------------------------------------------- #
# Independence
# --------------------------------------------------------------------------- #
plot(residuals(out_overall_ib) ~ d_overall_i$month, ylab="Residuals", xlab="Month")

# No Pattern as well so assumption is OK!

# --------------------------------------------------------------------------- #
# Homoscedasticity 
# --------------------------------------------------------------------------- #
plot(residuals(out_overall_ib) ~ fitted(out_overall_ib), ylab="Residuals", xlab="Fitted values")
abline(h = 0, col = "red")
library(lmtest)
bptest(out_overall_ib)

# Given the p-value of 5.936e-06, we would reject the null hypothesis of constant 
# variance in the residuals at the 0.05 significance level, 
# suggesting that heteroscedasticity is present in your regression model.

# --------------------------------------------------------------------------- #
# Normality of Residuals:
# --------------------------------------------------------------------------- #
hist(residuals(out_overall_ib), main="Histogram of Residuals", xlab="Residuals")
qqnorm(residuals(out_overall_ib))
qqline(residuals(out_overall_ib), col="red")

shapiro.test(residuals(out_overall_ib))
# W = 0.98765: This is the test statistic for the Shapiro-Wilk test. 
# The W statistic ranges between 0 and 1, with values close to 1 suggesting that 
# the distribution of the data is similar to a normal distribution.
# However, the p-value is a measure of the evidence against a specific null hypothesis. 
# In the context of the Shapiro-Wilk test, the null hypothesis is that the data is normally distributed. 
# A small p-value, typically less than 0.05, indicates that there's significant evidence against the null hypothesis of normality. 
# Our p-value of 1.775e-07 suggests that there's a statistically significant departure from perfect normality, even if that departure is subtle.
# This highlights a key point with statistical testing, especially in large samples: 
# It's possible to detect statistically significant deviations from assumptions even when those deviations are practically minor. 
# With a large enough sample, the Shapiro-Wilk test can identify even subtle deviations from normality.

# --------------------------------------------------------------------------- #
# Multicollinearity
# --------------------------------------------------------------------------- #
library(car)
vif(out_overall_ib)

# post_treatment: has a VIF value (GVIF^(1/(2*Df))) of 3.368086. 
# This suggests some multicollinearity but might not be alarmingly high based on the rule of thumb mentioned. 
# However, it's on the higher end and indicates that the variable post_treatment is somewhat linearly predictable from the other predictors in the model.

# as.factor(month): has a VIF value of 1.123633, 
# which is close to 1 and suggests that multicollinearity isn't much of a concern for this factor in your model.

